***************************************************************
//					~ Resetting Stata ~
***************************************************************

// Declare Stata Version
version 16

// Reset Stata Parameters
clear all
set maxvar 30000
set more off
clear matrix
clear mata
eststo clear
log close _all
timer clear
set seed 49740

***************************************************************
//					 ~ Path Directories ~
***************************************************************

// Base Directory
	// TODO: You need to change the base directory to where you saved the replication package.
global dir "C:/Users/username/replication_packet"   

// Folder Navigation
global dir_data		"$dir/data"
global dir_outputs	"$dir/outputs"

// Output Preference: only one at a time
global excel "off"
global latex "on"
	if "$excel" == "on" {
		global output "excel"
		global file "xls" 
	}
	if "$latex" == "on" {
		global output "tex"
		global file "tex"
	}
	

***************************************************************
//					 ~ Dependencies ~
***************************************************************

// These are the packages you will need to run this script
local package_list outreg2 distinct

// Flag in the beginning to download ado files needed to run
foreach package of local package_list {
	cap which `package'
	if _rc di "{red: This script needs -`package'-, please install first (try -ssc install `package'-)}"
	if _rc stop
}

***************************************************************
//   				Toggle Sections
***************************************************************		

// MAIN PAPER
global preface		"on"
global table_1		"on"
global table_2		"on"
global figure_1		"on"  
global figure_2		"on"
global intext_stats	"on"  


***************************************************************
//   				Preface
***************************************************************	
if "$preface" == "on" {
	
use "${dir_data}/sd_hh_full", clear
	// sample
	sum *_sample // pre-baseline r1 obs: 2,415; baseline r2 obs: 2226; endline r3 obs: 2117
	assert r1_sample==1 // every respondent was surveyed pre-baseline
	tab r2_sample r3_sample, mi
	// consent
	assert r1_consent1==1 // every respondent consented to the study at pre-baseline
	assert r2_sample==0 & r3_sample==0 if r1_consent2==0 // if respondents did not consent to follow-up surveys, they were not surveyed at baseline or endline
}

			
***************************************************************
//   				Table 1
***************************************************************	
if "$table_1" == "on" {
  
use "${dir_data}/sd_hh_endline", clear
  
	// toggle global output for frmttable
	if "$latex" != "on" global output ""
	
	// PRE-BASELINE AND BASELINE: 
	global sumsdvars1 "sd_support_yn sd_support_10hhs sd_pap sd_self sd_govtrec_yn sd_index_gmed" 	
	global sumsdvars2 "sd_others sd_otherhh sd_leader"
	
	local vars ${sumsdvars1}

	local numvars : word count `vars'
	tokenize `vars'
	forvalues i = 1/`numvars'  {
		sum r1_``i'' 
			local r1_N = r(N)
			local r1_mean = r(mean)
			local r1_sd = r(sd)
		sum r2_``i'' 
			local r2_N = r(N)	
			local r2_mean = r(mean)
			local r2_sd = r(sd)
		ttest r1_``i''=r2_``i'' // paired ttest for those surveyed in both rounds
			local N = r(N)
			di "`N'"
			local pval = r(p)
		mat b = (`r1_N',`r1_mean',`r1_sd',`r2_N',`r2_mean',`r2_sd',`pval')
		if "``i''"=="sd_pap" mat b = (.,.,.,`r2_N',`r2_mean',`r2_sd',.) // baseline only
		if `i'==1 {
				mat d = b
			}
			else {
				mat d = d\b  
			}
	}
		
	// BASELINE ONLY	
	foreach i in $sumsdvars2 {
		sum r2_`i' 
			local r2_N = r(N)	
			local r2_mean = r(mean)
			local r2_sd = r(sd)
		mat b = (.,.,.,`r2_N',`r2_mean',`r2_sd',.)
			mat d = d\b  	
	}
	
	*mat rownames d = `vars'
	frmttable using "${dir_outputs}/table_1", $output statmat(d) replace ///
		ctitles("VARIABLES","{Pre-Baseline}","","","{Baseline}","","","T-test" \ "","N","Mean","SD","N","Mean","SD","p-value") ///
		rtitles("Respondent supports social distancing (SD)"\"Perceived share of community supporting SD"\"Primary SD indicator"\"Self-report SD indicator"\"Self-report: Followed govt rules in past 14 days"\"Self-report: SD behaviors above median"\"Others SD indicator"\"Other households’ report of SD"\"Leaders’ report of SD")  ///
		multicol(1,2,3;1,5,3) sdec(0,3,3,0,3,3,3)

	// restore toggle global output
	if "$latex" != "on" global output "excel"
	
	// paired ttest of actual vs perceived community support in each rounds
	ttest r1_sd_support_yn=r1_sd_support_10hhs // pre-baseline
	ttest r2_sd_support_yn=r2_sd_support_10hhs // baseline
	
}
	
	
***************************************************************
//					Table 2
***************************************************************
if "$table_2" == "on" {
 
use "${dir_data}/sd_hh_endline", clear
 
	// models
	global model1 "sd1_commsupp sd2_endorse"
	global model2 "sd1_commsupp sd2_endorse sd1_commsupp_r3_pccases_dlvl sd2_endorse_r3_pccases_dlvl"
	// outcome variables
	global sdvars "sd_pap c25_2"
	// table details
	global title "Table 2: Regression of Social Distancing (SD) Treatments"
	
	// Rename baseline values
	cap rename r1_c25_2 r2_c25_2
	
	// Reset Table Replace
	local replace replace

// Regressions
	// Forloop for sdtreat regressions
	foreach yvar in $sdvars {  
		
		// dummy out missing control variables
		clonevar r2_yvar = r2_`yvar'
		gen r2_yvar_msg = r2_`yvar' == . 
		replace r2_yvar = 0 if r2_yvar_msg == 1
		
		// Main Effects
		qui: areg r3_`yvar' ${model1} r2_yvar r2_yvar_msg i.r2_leader_know_count i.r2_otherhh_know_count_bnd, a(id_school) robust
		qui: sum r3_`yvar' if e(sample) == 1 & sdtreat == 0 
			local my = r(mean)
			qui: estadd scalar my = r(mean)
			local sdy = r(sd)
			qui: estadd scalar sdy = r(sd)

			outreg2 using "${dir_outputs}/table_2.${file}", $output label ///
			addstat("Control Mean DV", `my',"Control SD DV",`sdy') keep(sd1_commsupp sd2_endorse) ///
			nocons bdec(4) sdec(4) adec(4) ///
			`replace' // add replace option to first outreg to clear previous ones
		
			local replace
			
		// Heterogenous Effects
		qui: areg r3_`yvar' ${model2} r2_yvar r2_yvar_msg i.r2_leader_know_count i.r2_otherhh_know_count_bnd, a(id_school) robust
		qui: sum r3_`yvar' if e(sample) == 1 & sdtreat == 0 
			local my = r(mean)
			qui: estadd scalar my = r(mean)
			local sdy = r(sd)
			qui: estadd scalar sdy = r(sd)
		
			outreg2 using "${dir_outputs}/table_2.${file}", $output label ///
			addstat("Control Mean DV", `my',"Control SD DV",`sdy') keep(sd1_commsupp sd2_endorse sd1_commsupp_r3_pccases_dlvl sd2_endorse_r3_pccases_dlvl) ///
			nocons bdec(4) sdec(4) adec(4) ///
			`replace' // add replace option to first outreg to clear previous ones
			
			local replace
			drop r2_yvar*
			
		}				
}


***************************************************************
//					Figure 1
***************************************************************	
if "$figure_1" == "on" {

// NOTE: Figure was made using Adobe Illustrator. Statistics in figure are replicated below: 

use "${dir_data}/sd_hh_endline", clear

	// 1) Self-report: observing government SD recommendations?
	count if r2_sd_govtrec_yn==0 // No
		display r(N) / _N // percent of households
		local no1 = r(N)
	count if r2_sd_govtrec_yn==1 // Yes
		display r(N) / _N // percent of households

	// 2) Self-report: doing at least 7 out of 8 SD actions?
	count if r2_sd_govtrec_yn==1 & r2_sd_index_gmed==0 // No
		display r(N) / _N // percent of households
		local no2 = r(N)
	count if r2_sd_govtrec_yn==1 & r2_sd_index_gmed==1 // Yes
		display r(N) / _N // percent of households		
		
	// 3) Others report respondent is doing SD?
	count if r2_sd_govtrec_yn==1 & r2_sd_index_gmed==1 & r2_sd_others==0 // No
		display r(N) / _N // percent of households
		local no3 = r(N)
	count if r2_sd_govtrec_yn==1 & r2_sd_index_gmed==1 & r2_sd_others==1 // Yes	
		display r(N) / _N // percent of households

	// FINAL SOCIAL DISTANCING measure
	tab r2_sd_pap 
		// No: sum of three "No" counts from above
		display `no1'+`no2'+`no3'
		display (`no1'+`no2'+`no3')/_N		
		// Yes: same as the "Yes" count after question 3 above

}
		

***************************************************************
//					Figure 2
***************************************************************	
if "$figure_2" == "on" {

/* NOTE:
Paper's figure was made using Adobe Illustrator. An analogous figure is replicated below.

Key difference: In the paper's figure, we scale the x-axis based on the district-level cumulative COVID-19 cases per 100,000 at the start of the endline survey. In the replication, these cases are only listed as the "label" for each district.
*/

 
use "${dir_data}/sd_hh_endline", clear
 
	// Models
	global model1 "sd1_commsupp sd2_endorse"
		
	// Outcome variables
	global sdvars "sd_pap_d6 sd_pap_d3 sd_pap_d1 sd_pap_d2 sd_pap_d5 sd_pap_d7 sd_pap_d4"
	global yvars "$sdvars"  // MUST BE CORRECT ORDER
	global numy: word count $yvars	

	// Figure options
	global ext "png" 
	set scheme s2color
	global title_opts "title("District-Level Misperceptions Correction" "Treatment Effects by COVID-19 Cases")"
	global marker_opts "mlabel mlabp(12) mlabsize(medium) format(%9.1f) "
	global xaxis_opts "xtitle("District-Level Cumulative Cases per 100,000 at Endline") xlabel(, format(%9.1f))"
	global yaxis_opts "ytitle("District-Level Treatment Effect on Social Norm Correction" "on Social Distancing Indicator (Percentage Points)") ylabel(, format(%9.0f))"
	global ci_opts "ciopts(recast(rcap))"
	global opts "$title_opts $marker_opts $xaxis_opts $yaxis_opts $ci_opts " 
	
	/*
		set scheme plottigblind
		global opts "format(%9.3f) xline(0) msize(medium) mlabel mlabp(12) mlabsize(medium)"
		global opts "$opts ciopts(recast(rcap)) xscale(range(-.01 .08)) xlabel(0 (.02).08)"
		global opts "$opts legend(rows(1) position(6) size(medium))"
	*/
	
// SETUP:
	 // Create new variable and store control summ stats for variable labels
	foreach yvar in $yvars {  
	qui: sum r3_`yvar' if ktreat == 0 
		local mu`yvar' = round(r(mean),.001)	
		local sd`yvar' = round(r(sd),.001)
		gen `yvar'=.
	}
		// Social Distancing 
		label var sd_pap_d1 `"{bf:4.1}"'
		label var sd_pap_d2 `"{bf:4.3}"'
		label var sd_pap_d3 `"{bf:3.6}"'
		label var sd_pap_d4 `"{bf:39.1}"'
		label var sd_pap_d5 `"{bf:9.3}"'
		label var sd_pap_d6 `"{bf:1.7}"'
		label var sd_pap_d7 `"{bf:28.8}"'
	
	// Create matrices to store results - one for each treatment variable
		foreach x in $model1 {
		matrix `x' = J(1,$numy,.)
		matrix colnames `x' = $yvars
		matrix `x'ci = J(2,$numy,.)
		matrix colnames `x'ci = $yvars
		matrix rownames `x'ci = ll95 ul95
		}
			 
	local i 0
	// Store regression estimates in matrices: new column for each outcome
		// sdvars: sd_pap sd_pap_chimoio sd_pap_others
	foreach yvar in $sdvars {  
		local ++ i

		areg r3_`yvar' ${model1} r2_sd_pap i.r2_leader_know_count i.r2_otherhh_know_count_bnd, a(id_school) robust

			foreach x in ${model1} {
			matrix `x'[1,`i'] = _b[`x']*100
			matrix `x'ci[1, `i'] = _b[`x']*100 - 1.96*_se[`x']*100 \ _b[`x']*100 + 1.96*_se[`x']*100 // multiply by 100 to put in percentage point units
			}
	}		
	
	// Rename matrices for treatment variables in legend
	matrix rename sd1_commsupp Misperceptions_Correction
	matrix rename sd2_endorse Leader_Endorsement
	
// OUTPUT:

	matrix list Misperceptions_Correction
	matrix list sd1_commsuppci
	sum r3_sd_pap_d4 if sdtreat==0
	// For district 4: treatment effect is 70% increase over that district's control-group mean.
	
*FIGURE 2:
coefplot (matrix(Misperceptions_Correction), ci(sd1_commsuppci) msymbol(triangle)) ///
		 , keep($yvars) $opts vertical
	graph export "${dir_outputs}/figure_2.png", replace

}


***************************************************************
//   				In-Text Statistics
***************************************************************		
if "$intext_stats" == "on" {
/// List of in-text statistics

/// SECTION 1: 
	// Text: Early in the pandemic, 98% of our Mozambican sample thought that people should be social distancing, but estimated that only 69% of others in the community felt similarly.
		// See Table 1, rows (1)-(2), under Pre-Baseline.

/// SECTION 3.1
	// Text: There was a minimum of 3.0 weeks and average of 6.3 weeks between baseline and endline surveys for all respondents.
	use "${dir_data}/sd_hh_endline", clear
		gen r3_dofc = dofc(r3_submissiondate) // endline
		gen r2_dofc = dofc(r2_submissiondate) // baseline
		gen r2r3_days = r3_dofc - r2_dofc // # days b/n baseline and endline
		gen r2r3_weeks = r2r3_days / 7 // # weeks b/n baseline and endline
		sum r2r3_weeks

	// Text: The retention rate between baseline and endline is 95.1% overall, at least 94.4% in each of the seven districts surveyed, and balanced across treatment conditions. 
	use "${dir_data}/sd_hh_full", clear
		// generate retention
		gen r2r3_retention = r3_sample==1 if r2_sample==1
		label var r2r3_retention "Dummy if retained between R2 \& R3"
			label define retention 0 "Attrited" 1 "Retained"
			label values r2r3_retention retention	
		// once you open new attrition dataset
		sum r2r3_retention	
		tab id_district, sum(r2r3_retention)
		// See Table A.5 Column (1) for evidence that attrition (and by extension, retention) is balanced across treatment conditions

	// Text: We also surveyed 145 community opinion leaders over the 76 study communities—at least one, an average of 2.11, and at most 4 per community—for inputs to the primary outcome and treatments as described below.
	use "${dir_data}/sd_leader.dta", clear
		distinct leader_id id_school // 145 community opinion leader, 76 communities (defined around schools)
		collapse (max) leader_yn, by(leader_id id_school)
		collapse (sum) leaders_count=leader_yn, by(id_school)
		sum leaders_count	
		
/// SECTION 3.3

	// Text: At endline, respondents were asked about all eight social distancing actions and, with a sample median of six, had to report doing seven or eight actions to be considered social distancing.
	use "${dir_data}/sd_hh_endline", clear
		sum r3_sd_index, detail // median is 6
		
	// Footnote 9: The average community leader was asked about 33.90 households (std. dev.=22.10, minimum=2, second-highest=99, maximum=228—a special case where one individual was the traditional leader across multiple communities).
	use "${dir_data}/sd_leader.dta", clear
		bysort leader_id: gen contacts_ttl=_N
		collapse contacts_ttl, by(leader_id)
		sum contacts_ttl, detail

	// Text: At baseline, the average respondent household was known by 0.98 community leaders and 3.21 neighboring survey respondents.
	use "${dir_data}/sd_hh_endline", clear	
		sum r2_leader_know_count
		sum r2_otherhh_know_count
		
	// Footnote 11: At baseline, 90.55% of respondent households were known by some other respondent or community leader.
	use "${dir_data}/sd_hh_endline", clear	
		sum r2_others_know_yn	
		
	// Footnote 12: For example, complete lack of observation by others was true for 9% of the sample (see footnote 11).
	use "${dir_data}/sd_hh_endline", clear
		sum r2_others_notknow_yn

/// SECTION 4.1 
	// Text: Sample sizes by treatment condition were as follows: misperceptions correction (N=628, 29.7% of sample), leader endorsement (N=637, 30.1%), and control group (N=852, 40.3%).
	use "${dir_data}/sd_hh_endline", clear
		tab sdtreat
		
	// Footnote 13: In 63 out of 76 communities (82.9%) the number we convey to respondents is 10 out of 10, and in 13 communities (17.1%) the number is 9 out of 10.
	use "${dir_data}/sd_hh_full", clear
		// NOTE: Community support for SD was calculated using pre-baseline (r1) data.
		// start with non-missing responses to individual responses to "Do you support social distancing?"
		// also excluded 7 responses of those who did not consent to future surveying (though, in hindsight, we could have left in these observations for this calculation)
		gen sd_support = r1_c10_1 if r1_consent2==1 
		// take the average across communities, which are defined by the school following Yang, et al. (2023) ""Knowledge, Stigma, and HIV Testing: An Analysis of a Widespread HIV/AIDS Program"
		bysort id_school: egen sd_support_commavg = mean(sd_support)
		// transform statistic "out of 10 members in your community" for simplicity (as pre-specified)
		gen sd1_commsupp_input = round(sd_support_commavg*10)
		// collapse to school level to generate in-text output				
		collapse (max) sd1_commsupp_input, by(id_school)
		tab sd1_commsupp_input
			
	// Text: In practice, 92.4% of treated respondents received this treatment, 53.2% of whom underestimated community support for social distancing and 46.8% of whom correctly estimated it.
	use "${dir_data}/sd_hh_full", clear
		// generate community-level input for SD1 treatment
		gen sd_support = r1_c10_1 if r1_consent2==1
		bysort id_school: egen sd_support_commavg = mean(sd_support)
		gen sd1_commsupp_input = round(sd_support_commavg*10)
		// generate dummy variables if respondent over, under or correctly estimated community support
		gen sd1_over = r2_c10_2 > sd1_commsupp_input
		gen sd1_under = r2_c10_2 < sd1_commsupp_input
		gen sd1_equal = r2_c10_2 == sd1_commsupp_input
		gen sd1_elig = r2_c10_2 <= sd1_commsupp_input
			label var sd1_over "Respondent overestimates community support for SD"
			label var sd1_under "Respondent underestimates community support for SD"
			label var sd1_equal "Respondent correctly estimates community support for SD"
			label var sd1_elig "Respondent is eligible for community support for SD treatment"
		// summarize of treated respondents: eligibility and, if so, under/equal
		sum sd1_elig if sdtreat==1 // dummy if eligible for treatment
		sum sd1_under sd1_equal if sdtreat==1 & sd1_elig==1 // of treated respondents who were eligible		

	// Footnote 15: Communities had at least one and an average of 2.09 endorsements from community leaders (std. dev.=0.94, maximum=4).
	use "${dir_data}/sd_leader.dta", clear
		distinct leader_id id_school // 145 community opinion leader, 76 communities (defined around schools)
		collapse (max) endorse_yn, by(leader_id id_school)
		collapse (sum) endorse_count=endorse_yn, by(id_school)
		sum endorse_count

	// Text: Attrition between baseline and endline is low (4.9%). 
	use "${dir_data}/sd_hh_full", clear
		// generate attrition variable
		gen r2r3_attrition = r3_sample==0 if r2_sample==1 
		label var r2r3_attrition "Dummy if attrited between R2 \& R3" 
		sum r2r3_attrition

	// Text: Further, at endline, 97.9% recall receiving the baseline survey and, of those, 99.4% report trusting the COVID-19 information we provided.
	use "${dir_data}/sd_hh_endline", clear
		sum r3_recall_r2_svy r3_trust_r2_covidinfo

/// SECTION 4.3
	// Text: The mean expert predictions were that the misperceptions correction and leader endorsement treatments would increase social distancing by 5.23 and 5.56 percentage points, respectively.	
	use "${dir_data}/sd_expert_predictions", clear 
	sum pred_sd1_1 pred_sd2_1
 	
/// SECTION 5.2
	// Text: We strongly reject the null that our T1 and T2 treatment effect estimates are equal to the positive mean expert predictions (p-value<0.001 in each case).
	// Store predictions in globals
	use "${dir_data}/sd_expert_predictions", clear 
	qui sum pred_sd1_1 
		global sd1_predict = r(mean)
	qui sum pred_sd2_1
		global sd2_predict = r(mean)
	// Run primary regression
	use "${dir_data}/sd_hh_full", clear
		// dummy out missing control variables
		clonevar r2_yvar = r2_sd_pap
		gen r2_yvar_msg = r2_sd_pap == . 
		replace r2_yvar = 0 if r2_yvar_msg == 1
		// Regression
		areg r3_sd_pap sd1_commsupp sd2_endorse r2_yvar r2_yvar_msg i.r2_leader_know_count i.r2_otherhh_know_count_bnd, a(id_school) robust
	// Test against expert predictions
		test sd1_commsupp = $sd1_predict
		test sd2_endorse = $sd2_predict
		// Drop missing dummy	
			drop r2_yvar*
			
/// SECTION 5.3
	// Text: in Chimoio, the district with the most cases (39.08/100,000) that also accounts for one-quarter of the sample, we estimate a large positive effect: 9.2 percentage points—a 70% increase over that district's control group (statistically significant at the 5% level).
	use "${dir_data}/sd_hh_endline", clear
		sum r3_sd_pap if sdtreat==0 & district=="Chimoio" // control-group mean for Chimoio
		// Treatment effect from Figure 2 (replicated above) is 9.2pp
		// 9.2 (treatment effect) / 13.2 (control-group mean) = .6970
		// Therefore treatment effect is 70% increase over that district's control-group mean.
}
	